Loading the required data
# Build `counts.data`: per-student weekly counts of daily logins (weeks 2-13),
# the median gap between consecutive logins, and the exam scores.
daily.counts <- read.csv("Intermediate_results/regularity_of_study/weekly_counts_of_daily_logins_w2-13.csv")
# colnames(daily.counts)
# keep the identifier, the per-week counts, the total count and the weekly entropy
weekly.counts <- daily.counts %>%
  select(user_id, W2_cnt:W13_cnt, tot_cnt, weekly_entropy)
# str(weekly.counts)
daily.gaps <- read.csv("Intermediate_results/regularity_of_study/gaps_between_consecutive_logins_w2-13.csv")
# str(daily.gaps)
# daily gaps are not normally distributed, so the median is used
# merge weekly counts and median time gap (all = TRUE keeps students present in either table)
counts.data <- merge(x = weekly.counts,
                     y = daily.gaps %>% select(user_id, median_gap),
                     by = "user_id", all = TRUE)
exam.scores <- read.csv(file = "Intermediate_results/exam_scores_with_student_ids.csv")
# remove email data (dropped by position: the 2nd column)
exam.scores <- exam.scores %>% select(-2)
# str(exam.scores)
# merge counts data with exam scores, keeping every row of counts.data;
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables
counts.data <- merge(x = counts.data, y = exam.scores,
                     by.x = "user_id", by.y = "USER_ID",
                     all.x = TRUE, all.y = FALSE)
# summary(counts.data)
# 9 NA values for exam scores; remove them
counts.data <- counts.data %>% filter(!is.na(SC_FE_TOT))
This means that predictors are counts of active days (days when a student had at least one learning session) per week, entropy of weekly active days, and median gap between two consecutive active days.
# Model 1: regress the final exam score on every remaining column after
# dropping the total count, the student identifier and SC_MT_TOT.
lm1.data <- select(counts.data, -c(tot_cnt, user_id, SC_MT_TOT))
lm1 <- lm(SC_FE_TOT ~ ., data = lm1.data)
summary(lm1)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm1.data)
Residuals:
Min 1Q Median 3Q Max
-23.2397 -6.3099 -0.6918 5.9104 19.9677
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.2257 2.3354 3.522 0.00047 ***
W2_cnt 0.4985 0.3431 1.453 0.14685
W3_cnt 0.0926 0.3651 0.254 0.79991
W4_cnt 0.1341 0.3532 0.380 0.70441
W5_cnt -0.3261 0.3228 -1.010 0.31290
W6_cnt -0.1632 0.3676 -0.444 0.65736
W7_cnt 0.5343 0.4188 1.276 0.20266
W8_cnt 1.0919 0.4068 2.685 0.00752 **
W9_cnt 0.1106 0.3862 0.286 0.77475
W10_cnt 1.0810 0.3454 3.130 0.00186 **
W11_cnt 0.1618 0.3891 0.416 0.67768
W12_cnt 0.4559 0.3298 1.383 0.16746
W13_cnt 0.8561 0.2901 2.951 0.00333 **
weekly_entropy 11.6096 9.9681 1.165 0.24475
median_gap -0.2072 0.3777 -0.549 0.58350
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.417 on 462 degrees of freedom
Multiple R-squared: 0.2831, Adjusted R-squared: 0.2614
F-statistic: 13.03 on 14 and 462 DF, p-value: < 2.2e-16
It’s interesting that the counts for only 3 weeks are significant and that all three weeks are in the 2nd part of the course (after the midterm exam): weeks 8, 10, and 13.
R-squared is 0.283 (adjusted R2: 0.261).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm1$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm1)
par(mfrow = c(1, 1)) # Change back to 1 x 1
# there are few potential influential points: 80, 50, 459
## assumption 4: predictors and residuals are uncorrelated
# Loop over the model's own predictors by name (instead of hard-coded column
# positions) and label each test, so the printed results can be told apart.
lm1.mf <- model.frame(lm1)
for (pred in attr(terms(lm1), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm1.mf[[pred]], lm1$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm1)
# OK, values below or equal to 2
The assumptions are satisfied, though there are a few potentially influential points that might need to be considered if this model is to be used.
# Model 2: final exam score explained by the total count, the median gap
# and the weekly entropy only.
lm2.data <- select(counts.data, tot_cnt, median_gap, weekly_entropy, SC_FE_TOT)
lm2 <- lm(SC_FE_TOT ~ ., data = lm2.data)
summary(lm2)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm2.data)
Residuals:
Min 1Q Median 3Q Max
-25.649 -6.003 -0.660 5.943 20.407
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.84084 2.17939 2.680 0.00762 **
tot_cnt 0.40120 0.04530 8.857 < 2e-16 ***
median_gap -0.05991 0.37409 -0.160 0.87284
weekly_entropy 11.87966 9.90829 1.199 0.23114
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.476 on 473 degrees of freedom
Multiple R-squared: 0.2557, Adjusted R-squared: 0.251
F-statistic: 54.17 on 3 and 473 DF, p-value: < 2.2e-16
The total number of active days is the only significant predictor, and it is highly significant. Each additional active day contributes 0.4 points to the final exam score.
R-squared is 0.2557 (adjusted R2: 0.251).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm2$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm2)
par(mfrow = c(1, 1)) # Change back to 1 x 1
# both OK
## assumption 4: predictors and residuals are uncorrelated
# Loop over the model's own predictors by name (instead of hard-coded column
# positions) and label each test, so the printed results can be told apart.
lm2.mf <- model.frame(lm2)
for (pred in attr(terms(lm2), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm2.mf[[pred]], lm2$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm2)
# OK, values below or equal to 2
All assumptions are satisfied.
Loading the required data
# Model 3 data: weekly session counts, weekly entropy and the median
# inter-session gap, joined with the final exam score.
weekly.sessions <- read.csv("Intermediate_results/regularity_of_study/weekly_session_props.csv")
# str(weekly.sessions)
ses.gap.data <- read.csv("Intermediate_results/regularity_of_study/inter-session_time_intervals.csv")
# str(ses.gap.data)
lm3.data <- merge(x = weekly.sessions %>% select(count_w2:count_w12, weekly_entropy, user_id),
                  y = ses.gap.data %>% select(user_id, median_s_gap),
                  by = "user_id", all = TRUE)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables
lm3.data <- merge(x = lm3.data, y = exam.scores %>% select(USER_ID, SC_FE_TOT),
                  by.x = "user_id", by.y = "USER_ID", all.x = TRUE, all.y = FALSE)
summary(lm3.data)
## remove rows with NAs
lm3.data <- lm3.data %>% filter(!is.na(SC_FE_TOT) & !is.na(median_s_gap))
# ## since some of the predictors are on quite different scales, rescale them
# apply(lm3.data %>% select(-user_id), 2, shapiro.test)
# apply(lm3.data %>% select(-user_id), 2, function(x) length(boxplot.stats(x)$out))
# # all predictors have outliers -> normalization is not advised
# lm3.sc.data <- scale.features(lm3.data %>% select(-c(user_id, SC_FE_TOT)))
# lm3.sc.data <- cbind(lm3.sc.data, SC_FE_TOT=lm3.data$SC_FE_TOT)
# summary(lm3.sc.data)
## the same results are obtained with scaled and unscaled (original) data;
## will keep the results with original data as they are easier to interpret
lm3.data <- lm3.data %>% select(-user_id)
lm3 <- lm(SC_FE_TOT ~ ., data = lm3.data)
summary(lm3)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm3.data)
Residuals:
Min 1Q Median 3Q Max
-21.6292 -5.9597 -0.8594 5.9990 21.2523
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.2754010 1.6511046 9.252 < 2e-16 ***
count_w2 0.0053628 0.1110213 0.048 0.96149
count_w3 0.1909662 0.1157368 1.650 0.09962 .
count_w4 -0.0832153 0.1005015 -0.828 0.40810
count_w5 -0.2802013 0.0866320 -3.234 0.00131 **
count_w7 0.2530987 0.1498286 1.689 0.09184 .
count_w8 0.1688238 0.1441518 1.171 0.24214
count_w9 0.1951703 0.1603609 1.217 0.22420
count_w10 0.5158746 0.1080020 4.777 2.4e-06 ***
count_w11 0.3756283 0.1545471 2.431 0.01546 *
count_w12 0.1057666 0.1333451 0.793 0.42808
weekly_entropy 25.4435546 8.4577490 3.008 0.00277 **
median_s_gap -0.0001634 0.0003508 -0.466 0.64167
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.31 on 463 degrees of freedom
Multiple R-squared: 0.2938, Adjusted R-squared: 0.2755
F-statistic: 16.05 on 12 and 463 DF, p-value: < 2.2e-16
Significant predictors:
R-squared: 0.294 (adjusted R2: 0.275).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm3$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm3)
par(mfrow = c(1, 1)) # Change back to 1 x 1
# mostly fine, but there are few (potentially) influential points: 412, 459, 437, 77
# let's examine them
lm3.data[c(412, 459, 437, 77), ]
summary(lm3.data)
# 437 has very low engagement and very high exam score (35)
# 459 has high engagement (at times very high) and zero (0) exam score
# 412 is similar to 459, but not that extreme (7 exam score; less active)
# 77 is almost completely inactive, and has zero (0) exam score
## assumption 4: predictors and residuals are uncorrelated
# The model frame holds exactly the rows and predictors used in the fit, so
# no extra NA filtering of lm3.data is needed to align it with the residuals.
# Each test is labelled with its predictor so the printed results can be told apart.
lm3.mf <- model.frame(lm3)
for (pred in attr(terms(lm3), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm3.mf[[pred]], lm3$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm3)
# OK, values below or slightly above 2
# Model 4 data: total session count and weekly entropy, plus every column of
# ses.gap.data except mad_s_gap, joined with the final exam score.
lm4.data <- merge(x = weekly.sessions %>% select(user_id, s_total, weekly_entropy),
                  y = ses.gap.data %>% select(-mad_s_gap),
                  by = "user_id", all = TRUE)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables
lm4.data <- merge(x = lm4.data, y = exam.scores %>% select(USER_ID, SC_FE_TOT),
                  by.x = "user_id", by.y = "USER_ID", all.x = TRUE, all.y = FALSE)
# keep only students with a final exam score; rows with other missing
# predictors are left in and dropped by lm() itself
lm4.data <- lm4.data %>% filter(!is.na(SC_FE_TOT)) %>% select(-user_id)
lm4 <- lm(SC_FE_TOT ~ ., data = lm4.data)
summary(lm4)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm4.data)
Residuals:
Min 1Q Median 3Q Max
-29.4102 -6.3591 -0.7392 6.5142 19.4481
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.526e+01 1.658e+00 9.205 < 2e-16 ***
s_total 1.235e-01 1.383e-02 8.926 < 2e-16 ***
weekly_entropy 3.188e+01 8.598e+00 3.708 0.000233 ***
median_s_gap 2.975e-05 3.607e-04 0.082 0.934310
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.599 on 472 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.2292, Adjusted R-squared: 0.2243
F-statistic: 46.78 on 3 and 472 DF, p-value: < 2.2e-16
Significant predictors:
R-squared: 0.229 (adjusted R2: 0.224).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm4$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm4)
par(mfrow = c(1, 1))
# the Residuals vs Fitted plot suggests that there might be some non-linear relationship between the outcome and the predictors
# there are also few influential points: 412, 459, 376, 77
# let's examine them
lm4.data[c(412, 459, 376, 77), ]
summary(lm4.data)
# 77 is a clear outlier
# 376 has relatively high engagement (above 3rd quartile), but very low exam score (4)
# 412 and 459 have already been examined before
## assumption 4: predictors and residuals are uncorrelated
# lm4.data is still restricted to rows with a known median_s_gap, as before,
# so that its state after this chunk is unchanged for any later code
lm4.data <- lm4.data %>% filter(!is.na(median_s_gap))
# The model frame holds exactly the rows lm() kept after dropping NAs, so it is
# always aligned with the residuals. Each test is labelled with its predictor.
lm4.mf <- model.frame(lm4)
for (pred in attr(terms(lm4), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm4.mf[[pred]], lm4$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm4)
# OK, values below 2
Loading the data
# Model 5 data: columns 1-8 and 11 of the weekday session table, left-joined
# with the exam scores (without SC_MT_TOT).
weekday.sessions <- read.csv("Intermediate_results/regularity_of_study/weekday_session_props.csv")
# str(weekday.sessions)
lm5.data <- merge(
  x = select(weekday.sessions, 1:8, 11),
  y = select(exam.scores, -SC_MT_TOT),
  by.x = "user_id", by.y = "USER_ID",
  all.x = TRUE, all.y = FALSE
)
# summary(lm5.data)
# keep students with a final exam score, then drop the identifier
lm5.data <- filter(lm5.data, !is.na(SC_FE_TOT))
lm5.data <- select(lm5.data, -user_id)
lm5 <- lm(SC_FE_TOT ~ ., data = lm5.data)
summary(lm5)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm5.data)
Residuals:
Min 1Q Median 3Q Max
-31.2357 -6.1439 -0.9892 6.7715 20.2779
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.07048 2.26948 6.641 8.69e-11 ***
Sun_count 0.08458 0.09785 0.864 0.387818
Mon_count 0.20397 0.04877 4.182 3.44e-05 ***
Tue_count 0.11681 0.03721 3.140 0.001799 **
Wed_count 0.10067 0.04187 2.405 0.016578 *
Thu_count 0.16141 0.04133 3.905 0.000108 ***
Fri_count 0.12135 0.10226 1.187 0.235970
Sat_count -0.02262 0.12568 -0.180 0.857269
weekday_entropy 17.61256 6.86108 2.567 0.010567 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.663 on 468 degrees of freedom
Multiple R-squared: 0.2307, Adjusted R-squared: 0.2176
F-statistic: 17.55 on 8 and 468 DF, p-value: < 2.2e-16
Significant predictors:
R-squared: 0.231 (adjusted R2: 0.218).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm5$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm5)
par(mfrow = c(1, 1)) # Change back to 1 x 1
# mostly fine, but there are few potentially influential points: usual suspects (412, 459), 230
# let's examine them
lm5.data[c(412, 459, 230), ]
summary(lm5.data)
# 230 has low engagement and very high exam score (35)
# 459 and 412 have already been considered
## assumption 4: predictors and residuals are uncorrelated
# Loop over the model's own predictors by name (instead of hard-coded column
# positions) and label each test, so the printed results can be told apart.
lm5.mf <- model.frame(lm5)
for (pred in attr(terms(lm5), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm5.mf[[pred]], lm5$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm5)
# OK, values below 2
As predictors, use total counts of different kinds of resources students used during their active days (an active day is a day when a student had at least one study session). The types of resources considered:
In addition, consider using:
Loading the data…
# Model 6.1 data: total counts ("tot*") and proportions ("prop*") of resource
# use, joined with the final exam score.
res.use.stats <- read.csv("Intermediate_results/regularity_of_study/daily_resource_use_statistics_w2-5_7-12.csv")
# str(res.use.stats)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables
lm6.data <- merge(res.use.stats, exam.scores,
                  by.x = "user_id", by.y = "USER_ID", all.x = TRUE, all.y = FALSE)
lm6.data <- lm6.data %>% select(-c(user_id, SC_MT_TOT)) %>% filter(!is.na(SC_FE_TOT))
lm6_1.data <- lm6.data %>% select(starts_with("tot"), starts_with("prop"), SC_FE_TOT)
# examine the presence of (high) correlation between the variables
ggcorr(lm6_1.data, method = c("complete", "spearman"),
       # geom = "circle", min_size = 0, max_size = 15,
       label = TRUE, label_size = 3.5,
       hjust = 0.85, size = 4, layout.exp = 1)
# tot_mcog_cnt and prop_mcog_used are highly correlated, as are tot_video_cnt and prop_video_used, and tot_mcq_cnt and prop_mcq_used
lm6_1.data <- lm6_1.data %>% select(-c(prop_mcog_used, prop_video_used, prop_mcq_used))
# remove the outliers (by row position) and re-run the model
lm6_1.data <- lm6_1.data[-c(86, 412, 462, 459), ]
lm6_1 <- lm(SC_FE_TOT ~., data = lm6_1.data)
summary(lm6_1)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm6_1.data)
Residuals:
Min 1Q Median 3Q Max
-22.0996 -5.9578 -0.1087 6.5627 20.6982
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.505e+01 2.572e+01 2.529 0.0118 *
tot_video_cnt 1.314e-03 8.603e-04 1.528 0.1273
tot_exe_cnt -6.820e-03 1.328e-03 -5.135 4.16e-07 ***
tot_mcq_cnt 8.755e-03 3.661e-03 2.392 0.0172 *
tot_mcog_cnt 9.811e-03 1.281e-02 0.766 0.4441
tot_res_cnt 1.008e-02 1.955e-03 5.156 3.74e-07 ***
prop_exe_used 4.138e-01 3.661e+00 0.113 0.9100
prop_res_used -4.626e+01 2.576e+01 -1.796 0.0732 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.798 on 465 degrees of freedom
Multiple R-squared: 0.1922, Adjusted R-squared: 0.1801
F-statistic: 15.81 on 7 and 465 DF, p-value: < 2.2e-16
Significant predictors:
R-squared: 0.192 (adjusted R2: 0.180).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm6_1$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm6_1)
par(mfrow = c(1, 1))
# unclear if homoscedasticity requirement is fulfilled; check using this plot:
check.homoschedasticity(lm6_1)
# not that good
# # the plots point to couple of outliers: 86, 412, 462, 459
# # let's check them:
# lm6_1.data[c(86, 412, 462, 459),]
# summary(lm6_1.data)
# # 459 and 462 have zero exam score, in spite of non-negligible number of learning events (especially 459)
# # 412 was highly active, but had very low exam score (7)
## assumption 4: predictors and residuals are uncorrelated
# Loop over the model's own predictors by name (instead of hard-coded column
# positions) and label each test, so the printed results can be told apart.
lm6_1.mf <- model.frame(lm6_1)
for (pred in attr(terms(lm6_1), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm6_1.mf[[pred]], lm6_1$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm6_1)
# it's fine: all below or equal to 2
The assumption of homoscedasticity cannot be considered satisfied (even after removing outliers)
# Model 6.2: the engagement indicators that proved at least slightly relevant
# in the previous model, plus the mad_X_cnt columns as indicators of regularity
lm6_2.data <- select(
  lm6.data,
  tot_mcq_cnt, tot_exe_cnt, tot_res_cnt, prop_res_used,
  starts_with("mad"), SC_FE_TOT
)
# look for (highly) correlated variables
plot.correlations(lm6_2.data)
# mad_res_cnt is dropped as highly correlated with tot_res_cnt (which proved significant)
lm6_2.data <- select(lm6_2.data, -mad_res_cnt)
lm6_2 <- lm(SC_FE_TOT ~., data = lm6_2.data)
summary(lm6_2)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm6_2.data)
Residuals:
Min 1Q Median 3Q Max
-22.9335 -6.1144 -0.1174 6.8471 23.1086
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 66.127636 26.170686 2.527 0.011840 *
tot_mcq_cnt 0.012136 0.003640 3.334 0.000923 ***
tot_exe_cnt -0.006543 0.001360 -4.813 2.01e-06 ***
tot_res_cnt 0.010416 0.002000 5.208 2.87e-07 ***
prop_res_used -47.054649 26.296683 -1.789 0.074201 .
mad_video_cnt -0.007622 0.081861 -0.093 0.925861
mad_exe_cnt -0.003117 0.020326 -0.153 0.878197
mad_mcq_cnt -0.543694 0.379258 -1.434 0.152362
mad_mcog_cnt -0.952299 1.089663 -0.874 0.382599
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.983 on 468 degrees of freedom
Multiple R-squared: 0.1728, Adjusted R-squared: 0.1586
F-statistic: 12.22 on 8 and 468 DF, p-value: 6.383e-16
None of the MAD variables is significant
As predictors, use total number of learning actions (during active days) with a particular topic focus; possible topic foci:
In addition, consider including the following basic statistics:
Loading the required data…
# Model 7.1 data: total counts ("tot*") and proportions ("*prop") of
# topic-focused learning actions, joined with the final exam score.
topic.stats <- read.csv("Intermediate_results/regularity_of_study/topic_counts_statistics_w2-5_7-12.csv")
# str(topic.stats)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables
lm7.data <- merge(topic.stats, exam.scores,
                  by.x = "user_id", by.y = "USER_ID", all.x = TRUE, all.y = FALSE)
lm7.data <- lm7.data %>% select(-c(user_id, SC_MT_TOT)) %>% filter(!is.na(SC_FE_TOT))
lm7_1.data <- lm7.data %>% select(starts_with("tot"), ends_with("prop"), SC_FE_TOT)
summary(lm7_1.data)
# examine the presence of (high) correlation between the variables
plot.correlations(lm7_1.data)
# exclude tot_orient_cnt and orient_prop as they are highly correlated with some other variables
lm7_1.data <- lm7_1.data %>% select(-c(tot_orient_cnt, orient_prop))
# exclude tot_prj_cnt, due to high VIF
lm7_1.data <- lm7_1.data %>% select(-tot_prj_cnt)
lm7_1 <- lm(SC_FE_TOT ~ ., data = lm7_1.data)
summary(lm7_1)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm7_1.data)
Residuals:
Min 1Q Median 3Q Max
-22.4947 -6.5546 -0.6803 6.5096 21.4971
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.592e+01 1.040e+01 4.413 1.27e-05 ***
tot_ontopic_cnt 4.854e-03 9.985e-04 4.861 1.60e-06 ***
tot_revisit_cnt -5.237e-03 1.455e-03 -3.600 0.000352 ***
tot_metacog_cnt 1.191e-02 3.565e-03 3.341 0.000901 ***
ontopic_prop 8.082e-01 3.322e+00 0.243 0.807890
revisit_prop 1.940e+00 3.106e+00 0.625 0.532537
metacog_prop -3.469e+01 1.017e+01 -3.410 0.000705 ***
prj_prop 4.500e+00 4.893e+00 0.920 0.358177
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.063 on 469 degrees of freedom
Multiple R-squared: 0.1562, Adjusted R-squared: 0.1436
F-statistic: 12.4 on 7 and 469 DF, p-value: 1.347e-14
Significant predictors:
R-squared: 0.156 (adjusted R2: 0.144).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm7_1$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm7_1)
par(mfrow = c(1, 1))
# normality is fine
# unclear if homoscedasticity requirement is fulfilled; check using this plot:
check.homoschedasticity(lm7_1)
# it's fine
## assumption 4: predictors and residuals are uncorrelated
# Only the model's predictors are tested. The previous positional loop (1:8)
# also covered the 8th column, SC_FE_TOT, i.e. the outcome itself, which is
# correlated with the residuals by construction and is not a predictor.
# Each test is labelled with its predictor so the results can be told apart.
lm7_1.mf <- model.frame(lm7_1)
for (pred in attr(terms(lm7_1), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm7_1.mf[[pred]], lm7_1$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm7_1)
# now, it's fine
# Model 7.2: the engagement indicators that proved at least slightly relevant
# in the previous model, plus the mad_X_cnt columns as indicators of regularity
lm7_2.data <- select(
  lm7.data,
  tot_ontopic_cnt, tot_revisit_cnt, tot_metacog_cnt, metacog_prop,
  starts_with("mad"), SC_FE_TOT
)
# look for (highly) correlated variables
plot.correlations(lm7_2.data)
# both tot_metacog_cnt and mad_orient_cnt are dropped here; the original note
# only says that tot_metacog_cnt is highly correlated with mad_metacog_cnt
# and mad_orient_cnt
lm7_2.data <- select(lm7_2.data, -c(tot_metacog_cnt, mad_orient_cnt))
lm7_2 <- lm(SC_FE_TOT ~., data = lm7_2.data)
summary(lm7_2)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm7_2.data)
Residuals:
Min 1Q Median 3Q Max
-20.586 -6.413 -1.047 6.841 22.230
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.254e+01 9.826e+00 4.330 1.83e-05 ***
tot_ontopic_cnt 7.177e-03 9.323e-04 7.698 8.27e-14 ***
tot_revisit_cnt -3.802e-03 1.352e-03 -2.812 0.00513 **
metacog_prop -2.679e+01 1.019e+01 -2.629 0.00885 **
mad_ontopic_cnt -1.257e-01 4.010e-02 -3.135 0.00182 **
mad_revisit_cnt -9.064e-02 7.364e-02 -1.231 0.21897
mad_metacog_cnt -3.913e-02 1.476e-01 -0.265 0.79102
mad_prj_cnt -7.659e-01 4.124e+00 -0.186 0.85276
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.1 on 469 degrees of freedom
Multiple R-squared: 0.1493, Adjusted R-squared: 0.1366
F-statistic: 11.76 on 7 and 469 DF, p-value: 8.013e-14
The only regularity indicator that proved significant: mad_ontopic_cnt - one unit increase in MAD of ontopic counts leads to a decrease of 0.126 points in the final exam score
R2: 0.1493 (adjusted R2: 0.1366).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm7_2$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm7_2)
par(mfrow = c(1, 1))
# normality is fine
# unclear if homoscedasticity requirement is fulfilled; check using this plot:
check.homoschedasticity(lm7_2)
# not bad
# a few influential points: 60, 54, 202
# and a few outliers: 19, 294, 50
## assumption 4: predictors and residuals are uncorrelated
# Loop over the model's own predictors by name (instead of hard-coded column
# positions) and label each test, so the printed results can be told apart.
lm7_2.mf <- model.frame(lm7_2)
for (pred in attr(terms(lm7_2), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm7_2.mf[[pred]], lm7_2$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm7_2)
# OK
A few outliers and (potentially) influential points; apart from that, it’s fine
Indicators are computed at the week level, based on the following principle: a score of one is given to a student (for a given week) if he/she used a certain kind of resource (e.g. video) more than the average (median) use of that resource type in the given week.
Loading the data…
# Model 8 data: weekly resource-use indicators left-joined with the final exam score
res.use.ind <- read.csv("Intermediate_results/regularity_of_study/res_use_indicators_w2-13.csv")
# str(res.use.ind)
lm8.data <- merge(
  x = res.use.ind,
  y = select(exam.scores, USER_ID, SC_FE_TOT),
  by.x = "user_id", by.y = "USER_ID",
  all.x = TRUE, all.y = FALSE
)
# summary(lm8.data)
# students without a final exam score are dropped, then the identifier
lm8.data <- filter(lm8.data, !is.na(SC_FE_TOT))
lm8.data <- select(lm8.data, -user_id)
# examine correlations
plot.correlations(lm8.data)
# VIDEO_ind and MCQ_ind are highly correlated; VIDEO_ind is the one removed
lm8.data <- select(lm8.data, -VIDEO_ind)
lm8 <- lm(SC_FE_TOT ~ ., data = lm8.data)
summary(lm8)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm8.data)
Residuals:
Min 1Q Median 3Q Max
-22.7333 -5.8524 -0.1573 6.1593 21.4361
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.10045 1.07114 15.031 < 2e-16 ***
MCQ_ind 0.95088 0.16395 5.800 1.22e-08 ***
EXE_ind -0.82529 0.14298 -5.772 1.42e-08 ***
RES_ind 0.60658 0.14833 4.089 5.08e-05 ***
METACOG_ind 0.03796 0.15725 0.241 0.809
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.38 on 472 degrees of freedom
Multiple R-squared: 0.2739, Adjusted R-squared: 0.2678
F-statistic: 44.52 on 4 and 472 DF, p-value: < 2.2e-16
Significant predictors:
R-squared: 0.274 (adjusted R2: 0.268).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm8$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm8)
par(mfrow = c(1, 1))
# both normality and homoscedasticity requirements are fulfilled
## assumption 4: predictors and residuals are uncorrelated
# Loop over the model's own predictors by name, so the check does not depend
# on the column order of the input file, and label each test so the printed
# results can be told apart.
lm8.mf <- model.frame(lm8)
for (pred in attr(terms(lm8), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm8.mf[[pred]], lm8$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm8)
# it's fine
Indicators are computed at the week level, based on the following principle: a score of one is given to a student (for a given week), if his/her number of events related to a particular topic type (e.g. revisiting) was above the average (median) number of events with that topic type in the given week
Weeks 6 and 13 are excluded from these computations, as during these weeks one can expect different behavioral patterns than usual.
Loading the data
# Model 9 data: weekly topic-based indicators left-joined with the final exam score
topic.ind <- read.csv("Intermediate_results/regularity_of_study/topic_based_indicators_w2-5_7-12.csv")
# str(topic.ind)
lm9.data <- merge(
  x = topic.ind,
  y = select(exam.scores, USER_ID, SC_FE_TOT),
  by.x = "user_id", by.y = "USER_ID",
  all.x = TRUE, all.y = FALSE
)
# summary(lm9.data)
# students without a final exam score are dropped, then the identifier
lm9.data <- filter(lm9.data, !is.na(SC_FE_TOT))
lm9.data <- select(lm9.data, -user_id)
plot.correlations(lm9.data)
# orient_ind and metacog_ind are highly correlated; orient_ind is the one removed
lm9.data <- select(lm9.data, -orient_ind)
lm9 <- lm(SC_FE_TOT ~ ., data = lm9.data)
summary(lm9)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm9.data)
Residuals:
Min 1Q Median 3Q Max
-21.7208 -6.8879 -0.8953 7.0348 23.7086
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 13.2952 1.0797 12.314 < 2e-16 ***
ontopic_ind 0.9303 0.1693 5.496 6.38e-08 ***
revisit_ind -0.2917 0.1714 -1.702 0.0893 .
metacog_ind 0.4606 0.2082 2.212 0.0274 *
prj_ind 0.2421 0.3127 0.774 0.4392
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.076 on 472 degrees of freedom
Multiple R-squared: 0.1484, Adjusted R-squared: 0.1412
F-statistic: 20.56 on 4 and 472 DF, p-value: 1.236e-15
Significant predictors:
R-squared: 0.1484 (adjusted R-squared: 0.1412)
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm9$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm9)
par(mfrow = c(1, 1))
# both normality and homoscedasticity requirements are fulfilled
# there are few outliers (202, 213, 365), but no influential points
# check the outliers
lm9.data[c(202, 213, 365), ]
# very interesting:
# - 202 was highly active but ended up with zero final exam score
# - 213 and 365 were not preparing for lectures, and generally had low engagement, but did the exam excellently
## assumption 4: predictors and residuals are uncorrelated
# Loop over the model's own predictors by name, so the check does not depend
# on the column order of the input file, and label each test so the printed
# results can be told apart.
lm9.mf <- model.frame(lm9)
for (pred in attr(terms(lm9), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm9.mf[[pred]], lm9$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm9)
# it's fine
Use those indicators that proved significant in the previous two models (models 8 and 9)
Loading the data
# Model 10 data: selected topic-based and resource-use indicators, joined with
# the final exam score. user_id stays in lm10.data and is dropped only at fit time.
topic.ind <- read.csv("Intermediate_results/regularity_of_study/topic_based_indicators_w2-5_7-12.csv")
res.use.ind <- read.csv("Intermediate_results/regularity_of_study/res_use_indicators_w2-13.csv")
lm10.data <- merge(
  x = select(topic.ind, user_id, ontopic_ind, metacog_ind),
  y = select(res.use.ind, user_id, MCQ_ind, RES_ind, EXE_ind),
  by = "user_id", all = TRUE
)
lm10.data <- merge(
  x = lm10.data,
  y = select(exam.scores, USER_ID, SC_FE_TOT),
  by.x = "user_id", by.y = "USER_ID",
  all.x = TRUE, all.y = FALSE
)
summary(lm10.data)
# students without a final exam score are dropped
lm10.data <- filter(lm10.data, !is.na(SC_FE_TOT))
plot.correlations(lm10.data %>% select(-user_id))
# RES_ind and metacog_ind are highly correlated; metacog_ind is removed as it
# was less significant in the previous models
lm10.data <- select(lm10.data, -metacog_ind)
lm10 <- lm(SC_FE_TOT ~ ., data = lm10.data %>% select(-user_id))
summary(lm10)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm10.data %>% select(-user_id))
Residuals:
Min 1Q Median 3Q Max
-21.6264 -5.5145 -0.4311 6.2478 22.6760
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.0064 1.0433 15.343 < 2e-16 ***
ontopic_ind 0.4506 0.2055 2.192 0.028843 *
MCQ_ind 0.7235 0.1851 3.908 0.000107 ***
RES_ind 0.5582 0.1492 3.742 0.000205 ***
EXE_ind -0.9275 0.1496 -6.199 1.24e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.339 on 472 degrees of freedom
Multiple R-squared: 0.2812, Adjusted R-squared: 0.2751
F-statistic: 46.15 on 4 and 472 DF, p-value: < 2.2e-16
All 4 predictors are significant; however, only slight improvement in R2 w.r.t. model 8: R-squared: 0.2812 (adjusted R-squared: 0.2751)
Loading the data
# Model 11 data: lm10.data extended with the total session count and weekly entropy
weekly.sessions <- read.csv("Intermediate_results/regularity_of_study/weekly_session_props.csv")
lm11.data <- merge(
  x = lm10.data,
  y = select(weekly.sessions, user_id, s_total, weekly_entropy),
  by = "user_id", all.x = TRUE, all.y = FALSE
)
# summary(lm11.data)
# positional reorder: the 6th column is moved to the end
lm11.data <- lm11.data[, c(1:5, 7, 8, 6)]
plot.correlations(lm11.data %>% select(-user_id))
# the total number of sessions is highly correlated with almost all other variables
lm11.data <- select(lm11.data, -s_total)
# user_id and ontopic_ind stay in lm11.data but are left out of the fit
lm11 <- lm(SC_FE_TOT ~ ., data = lm11.data %>% select(-c(user_id, ontopic_ind)))
summary(lm11)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm11.data %>% select(-c(user_id,
ontopic_ind)))
Residuals:
Min 1Q Median 3Q Max
-24.1085 -5.7500 -0.1444 5.9306 20.1971
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.6142 1.5988 14.770 < 2e-16 ***
MCQ_ind 0.8916 0.1438 6.200 1.23e-09 ***
RES_ind 0.4795 0.1444 3.320 0.000969 ***
EXE_ind -0.9923 0.1406 -7.060 6.00e-12 ***
weekly_entropy 42.6242 7.0907 6.011 3.69e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.077 on 472 degrees of freedom
Multiple R-squared: 0.3255, Adjusted R-squared: 0.3198
F-statistic: 56.94 on 4 and 472 DF, p-value: < 2.2e-16
All 4 predictors that were eventually used for model building - MCQ_ind, EXE_ind, RES_ind, and weekly_entropy - proved highly significant.
R-squared: 0.3255 (adjusted R-squared: 0.3198)
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm11$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm11)
par(mfrow = c(1, 1))
# normality is fulfilled, but the homoscedasticity requirement is questionable
check.homoschedasticity(lm11)
# not good, there are outliers and/or influential points
# check the outliers
lm11.data[c(49, 294, 50), ]
# - 294 and 50: low to moderate activity indicators and zero final exam score
# - 230: moderate activity indicators, but excellent exam score
# NOTE(review): the notes above mention observation 230, but the rows
# displayed are 49, 294 and 50 - confirm which observation was intended
# check influential points
inf.indices <- as.numeric(names(head(sort(cooks.distance(lm11), decreasing = TRUE))))
lm11.data[inf.indices, ]
# observations with ordinal numbers 163, 241, 336 should be considered for removal
## assumption 4: predictors and residuals are uncorrelated
# Test exactly the predictors that are in the model. lm11.data still carries
# user_id and ontopic_ind, which lm11 was fitted without, so the previous
# positional loop over columns 1:5 tested those two and never reached
# weekly_entropy. Each test is labelled with its predictor.
lm11.mf <- model.frame(lm11)
for (pred in attr(terms(lm11), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm11.mf[[pred]], lm11$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm11)
## ontopic_ind and MCQ_ind have values > 2; remove ontopic_ind as it is not significant
# now (after ontopic_ind was removed), it's fine
If the model is to be used, the outliers should be dealt with.
In addition to predictors from Model 10 and weekday entropy of study session counts, use, as predictors, study session counts for those week days that proved as significant predictors in Model 5 (Mon, Tue, Wed, Thu).
Loading the data…
# Model 12 data: lm10.data extended with the Mon-Thu session counts and weekday entropy
weekday.sessions <- read.csv("Intermediate_results/regularity_of_study/weekday_session_props.csv")
lm12.data <- merge(
  x = lm10.data,
  y = select(weekday.sessions, user_id, Mon_count, Tue_count, Wed_count,
             Thu_count, weekday_entropy),
  by = "user_id", all.x = TRUE, all.y = FALSE
)
# summary(lm12.data)
# positional reorder: the 6th column is moved to the end
lm12.data <- lm12.data[, c(1:5, 7:11, 6)]
plot.correlations(lm12.data %>% select(-user_id))
# user_id, ontopic_ind and RES_ind stay in lm12.data but are left out of the fit
lm12 <- lm(SC_FE_TOT ~ ., data = lm12.data %>% select(-c(user_id, ontopic_ind, RES_ind)))
summary(lm12)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm12.data %>% select(-c(user_id,
ontopic_ind, RES_ind)))
Residuals:
Min 1Q Median 3Q Max
-26.9153 -5.6781 -0.2458 5.9728 21.6389
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 22.93612 2.25082 10.190 < 2e-16 ***
MCQ_ind 0.59566 0.15805 3.769 0.000185 ***
EXE_ind -1.00543 0.14347 -7.008 8.45e-12 ***
Mon_count 0.14117 0.04520 3.123 0.001901 **
Tue_count 0.11040 0.03598 3.068 0.002278 **
Wed_count 0.08320 0.04090 2.034 0.042474 *
Thu_count 0.12923 0.04152 3.112 0.001970 **
weekday_entropy 26.94642 6.01675 4.479 9.45e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.085 on 469 degrees of freedom
Multiple R-squared: 0.3285, Adjusted R-squared: 0.3185
F-statistic: 32.78 on 7 and 469 DF, p-value: < 2.2e-16
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm12$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
par(mfrow = c(2, 2))
plot(lm12)
par(mfrow = c(1, 1))
# both normality and homoscedasticity requirements are fulfilled
# there are few outliers (230, 50, 459), but they do not look that significant
# check influential points
inf.indices <- as.numeric(names(head(sort(cooks.distance(lm12), decreasing = TRUE))))
lm12.data[inf.indices, ]
# observations with ordinal numbers 459, 22, 292 should be considered for removal
## assumption 4: predictors and residuals are uncorrelated
# Test exactly the predictors that are in the model. lm12.data still carries
# user_id, ontopic_ind and RES_ind, which lm12 was fitted without, so the
# previous positional loop over columns 1:10 tested those as well.
# Each test is labelled with its predictor so the results can be told apart.
lm12.mf <- model.frame(lm12)
for (pred in attr(terms(lm12), "term.labels")) {
  cat("\nPredictor:", pred, "\n")
  print(cor.test(lm12.mf[[pred]], lm12$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm12)
## ontopic_ind has value > 2; remove it
## now RES_ind stands out with high value; remove it
# now (after ontopic_ind and RES_ind were removed), it's fine
Loading the data…
# Model 13 data: lm12.data (without ontopic_ind and RES_ind) extended with weekly entropy
weekly.sessions <- read.csv("Intermediate_results/regularity_of_study/weekly_session_props.csv")
lm13.data <- merge(
  x = select(lm12.data, -c(ontopic_ind, RES_ind)),
  y = select(weekly.sessions, user_id, weekly_entropy),
  by = "user_id", all.x = TRUE, all.y = FALSE
)
# summary(lm13.data)
# positional reorder: the last two columns are swapped
lm13.data <- lm13.data[, c(1:8, 10, 9)]
plot.correlations(lm13.data %>% select(-user_id))
lm13 <- lm(SC_FE_TOT ~ ., data = lm13.data %>% select(-user_id))
summary(lm13)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm13.data %>% select(-user_id))
Residuals:
Min 1Q Median 3Q Max
-25.4716 -5.6469 -0.3401 5.5829 20.8140
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.49497 2.31501 11.013 < 2e-16 ***
MCQ_ind 0.64099 0.15619 4.104 4.79e-05 ***
EXE_ind -1.05966 0.14208 -7.458 4.29e-13 ***
Mon_count 0.12474 0.04475 2.788 0.005526 **
Tue_count 0.08943 0.03587 2.493 0.013013 *
Wed_count 0.07164 0.04041 1.773 0.076912 .
Thu_count 0.10852 0.04127 2.630 0.008832 **
weekday_entropy 17.59079 6.40524 2.746 0.006259 **
weekly_entropy 29.83472 7.72548 3.862 0.000128 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.968 on 468 degrees of freedom
Multiple R-squared: 0.3492, Adjusted R-squared: 0.3381
F-statistic: 31.39 on 8 and 468 DF, p-value: < 2.2e-16
R-squared: 0.349 (adjusted R-squared: 0.338)
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm13$residuals)
# OK
# assumption 2: homoscedasticity of residuals or equal variance
# assumption 3: Normality of residuals
# draw the standard lm diagnostic plots in a 2x2 grid, then restore the single-plot layout
par(mfrow = c(2, 2))
plot(lm13)
par(mfrow = c(1, 1))
check.homoschedasticity(lm13)
# normality is fulfilled
# homoscedasticity is somewhat questionable - there are outliers and/or influential points
# outliers: 230, 50, 459
# check influential points: the observations with the largest Cook's distance
inf.indices <- head(sort(cooks.distance(lm13), decreasing = TRUE))
inf.indices
lm13.data[as.numeric(names(inf.indices)), ]
# observations with ordinal numbers 459, 163, 336, and 241 should be considered for removal (all have final exam score = zero)
## assumption 4: predictors and residuals are uncorrelated
# Test only the model's predictors. Columns 1 and 10 of lm13.data are user_id and
# the response SC_FE_TOT; the response is always correlated with the residuals,
# so including it would make this check misleading.
# Note: for a least-squares fit with an intercept, the residuals are uncorrelated
# with every predictor in the model by construction, so this is a sanity check only.
lm13.frame <- model.frame(lm13)
for (pred in attr(terms(lm13), "term.labels")) {
  cat("Predictor:", pred, "\n")
  print(cor.test(lm13.frame[[pred]], lm13$residuals))
}
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm13)
# it's fine
Assessing R2 shrinkage using 10-Fold Cross-Validation (following guidance from http://www.statmethods.net/stats/regression.html)
# define the fit and predict callbacks passed to crossval():
# theta.fit fits a least-squares model; theta.predict applies its coefficients
# (intercept first) to a matrix of new predictor values
theta.fit <- function(x, y) {
  lsfit(x, y)
}
theta.predict <- function(fit, x) {
  cbind(1, x) %*% fit$coefficients
}
# matrix of predictors
X <- as.matrix(lm13.data %>% select(-c(user_id, SC_FE_TOT)))
# vector of observed values of the response (final exam score)
y <- as.matrix(lm13.data %>% select(SC_FE_TOT))
# fix the RNG seed so that the random assignment of observations to folds,
# and hence the cross-validated R2, is reproducible across runs
set.seed(372017)
results <- crossval(X, y, theta.fit, theta.predict, ngroup = 10)
cor(y, lm13$fitted.values)^2 # raw R2
[,1]
SC_FE_TOT 0.3492284
cor(y, results$cv.fit)^2 # cross-validated R2
[,1]
SC_FE_TOT 0.3232305
Raw R-squared: 0.349; cross-validated R-squared: 0.323
Compute cross-validated standard error of prediction (following guidance from: http://www.statmethods.net/stats/regression.html)
# 10-fold cross-validation of the lm13 model on the lm13 data
cv.out <- cv.lm(form.lm = lm13, data = lm13.data, m = 10)
Analysis of Variance Table
Response: SC_FE_TOT
Df Sum Sq Mean Sq F value Pr(>F)
MCQ_ind 1 9567 9567 150.69 < 2e-16 ***
EXE_ind 1 1753 1753 27.61 2.3e-07 ***
Mon_count 1 680 680 10.72 0.00114 **
Tue_count 1 522 522 8.22 0.00433 **
Wed_count 1 508 508 8.01 0.00486 **
Thu_count 1 656 656 10.34 0.00139 **
weekday_entropy 1 1311 1311 20.65 7.0e-06 ***
weekly_entropy 1 947 947 14.91 0.00013 ***
Residuals 468 29711 63
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As there is >1 explanatory variable, cross-validation
predicted values for a fold are not a linear function
of corresponding overall predicted values. Lines that
are shown for the different folds are approximate
fold 1
Observations in test set: 47
3 10 13 20 27 40 48 70 77 81 95 99 106
Predicted 19.49 22.35 22.89 19.1 13.53 16.95 18.11 28.04 5.33 8.528 30.4 14.62 18.23
cvpred 19.53 22.25 22.89 19.4 13.68 16.92 18.05 28.12 5.02 8.494 30.6 14.96 18.47
SC_FE_TOT 24.00 22.00 25.00 8.0 10.00 11.00 14.00 34.00 0.00 9.000 20.0 23.00 15.00
CV residual 4.47 -0.25 2.11 -11.4 -3.68 -5.92 -4.05 5.88 -5.02 0.506 -10.6 8.04 -3.47
119 125 139 152 154 159 172 188 191 196 210 211 225
Predicted 16.060 20.18 25.03 23.5 25.43 23.62 23.97 20.41 7.76 12.19 22.6 22.6 16.30
cvpred 15.711 20.86 24.96 23.3 25.93 23.35 24.45 20.45 7.58 12.46 22.6 22.6 15.98
SC_FE_TOT 16.000 15.00 20.00 11.0 18.00 18.00 33.00 28.00 10.00 4.00 26.0 33.0 25.00
CV residual 0.289 -5.86 -4.96 -12.3 -7.93 -5.35 8.55 7.55 2.42 -8.46 3.4 10.4 9.02
233 242 248 265 269 270 300 301 310 314 326 336 340
Predicted 18.72 22.319 24.1 22.71 31.37 26.4 17.1 27.8 17.91 22.66 17.10 -7.25 21.0
cvpred 19.02 22.456 24.6 23.52 30.95 26.3 16.8 27.5 18.74 22.83 17.69 -8.35 21.2
SC_FE_TOT 22.00 22.000 8.0 17.00 40.00 39.0 32.0 39.0 16.00 16.00 9.00 0.00 35.0
CV residual 2.98 -0.456 -16.6 -6.52 9.05 12.7 15.2 11.5 -2.74 -6.83 -8.69 8.35 13.8
357 359 379 395 408 410 436 446
Predicted 20.38 18.0 14.14 17.12 20.6 13.778 20.20 14.87
cvpred 20.58 18.3 14.45 16.63 20.9 14.007 20.06 15.24
SC_FE_TOT 11.00 0.0 12.00 19.00 10.0 15.000 15.00 13.00
CV residual -9.58 -18.3 -2.45 2.37 -10.9 0.993 -5.06 -2.24
Sum of squares = 3094 Mean square = 65.8 n = 47
fold 2
Observations in test set: 48
7 34 42 67 73 75 104 105 114 135 138 142 143
Predicted 19.42 29.2992 29.8 12.89 23.62 18.03 11.80 19.22 14.6 16.08 26.3 26.40 29.00
cvpred 19.31 29.0167 29.3 12.84 23.13 17.98 11.79 19.08 14.6 16.02 25.8 26.03 28.41
SC_FE_TOT 16.00 29.0000 19.0 9.00 21.00 23.00 18.00 26.00 7.0 14.00 38.0 32.00 37.00
CV residual -3.31 -0.0167 -10.3 -3.84 -2.13 5.02 6.21 6.92 -7.6 -2.02 12.2 5.97 8.59
149 150 174 184 186 190 201 204 209 212 223 224 236 243
Predicted 24.68 19.7 12.91 19.6 13.8 21.8 15.09 22.02 32.64 19.4 28.04 27.3 17.62 33.23
cvpred 24.18 19.6 12.77 19.3 13.7 21.7 15.13 21.81 32.28 19.3 27.42 26.8 17.42 32.63
SC_FE_TOT 29.00 34.0 17.00 34.0 29.0 37.0 12.00 19.00 38.00 17.0 33.00 38.0 20.00 39.00
CV residual 4.82 14.4 4.23 14.7 15.3 15.3 -3.13 -2.81 5.72 -2.3 5.58 11.2 2.58 6.37
245 253 262 272 280 282 283 299 324 343 349 351
Predicted 22.07 13.54 11.62 30.90 9.07 20.83 17.702 25.05 23.73 17.86 23.0 21.00
cvpred 22.22 13.53 11.46 30.35 9.02 20.68 17.551 24.67 23.47 17.79 22.9 20.69
SC_FE_TOT 20.00 15.00 8.00 33.00 13.00 27.00 17.000 21.00 33.00 23.00 8.0 19.00
CV residual -2.22 1.47 -3.46 2.65 3.98 6.32 -0.551 -3.67 9.53 5.21 -14.9 -1.69
364 368 372 381 386 392 399 402 453
Predicted 18.81 10.5 19.2 23.69 14.32 31.07 26.41 27.24 23.26
cvpred 18.56 10.7 19.1 23.27 14.19 30.49 25.95 26.79 22.91
SC_FE_TOT 20.00 7.0 13.0 19.00 8.00 26.00 31.00 35.00 26.00
CV residual 1.44 -3.7 -6.1 -4.27 -6.19 -4.49 5.05 8.21 3.09
Sum of squares = 2434 Mean square = 50.7 n = 48
fold 3
Observations in test set: 48
15 22 30 36 45 60 84 90 97 123 127 131 133
Predicted 18.5 25.5 22.4 11.88 7.82 12.273 13.27 21.5 14.9 19.2 24.74 23.00 29.12
cvpred 18.4 26.6 22.8 12.38 7.86 11.123 12.59 21.0 14.4 19.1 23.51 21.97 28.24
SC_FE_TOT 38.0 9.0 36.0 20.00 5.00 12.000 18.00 32.0 29.0 9.0 33.00 30.00 38.00
CV residual 19.6 -17.6 13.2 7.62 -2.86 0.877 5.41 11.0 14.6 -10.1 9.49 8.03 9.76
141 144 151 164 170 173 175 193 200 241 250 255 261
Predicted 18.5 25.33 11.3 25.61 32.29 23.6 16.40 24.90 28.2 -7.20 19.93 16.21 16.2
cvpred 17.8 25.29 11.1 25.39 32.39 22.8 16.11 24.52 28.4 -8.85 19.22 15.71 16.2
SC_FE_TOT 21.0 30.00 24.0 33.00 35.00 34.0 9.00 26.00 17.0 0.00 26.00 12.00 8.0
CV residual 3.2 4.71 12.9 7.61 2.61 11.2 -7.11 1.48 -11.4 8.85 6.78 -3.71 -8.2
266 275 279 293 332 347 348 353 369 371 373 391 405
Predicted 19.84 33.08 16.12 18.25 19.903 29.2 21.44 15.88 24.0 21.59 17.3 30.32 27.49
cvpred 19.87 32.36 16.15 17.74 19.375 27.8 21.11 15.34 23.7 21.08 17.1 30.57 27.24
SC_FE_TOT 10.00 38.00 8.00 21.00 20.000 33.0 15.00 17.00 38.0 24.00 20.0 33.00 33.00
CV residual -9.87 5.64 -8.15 3.26 0.625 5.2 -6.11 1.66 14.3 2.92 2.9 2.43 5.76
415 417 419 435 450 457 458 472 477
Predicted 21.0 28.81 15.66 29.4 24.14 24.05 11.4 14.44 13.71
cvpred 21.6 29.26 15.42 28.7 24.38 24.89 11.3 13.86 13.61
SC_FE_TOT 12.0 27.00 11.00 33.0 16.00 17.00 22.0 9.00 16.00
CV residual -9.6 -2.26 -4.42 4.3 -8.38 -7.89 10.7 -4.86 2.39
Sum of squares = 3374 Mean square = 70.3 n = 48
fold 4
Observations in test set: 48
16 23 55 63 65 69 72 74 79 109 111 117 118
Predicted 17.1 10.42 15.51 20.25 16.42 18.58 8.36 21.67 19.75 14.01 25.43 15.29 15.88
cvpred 17.8 10.63 15.84 20.07 16.22 18.81 8.83 21.18 19.86 14.32 25.26 15.24 15.75
SC_FE_TOT 0.0 16.00 9.00 28.00 11.00 14.00 7.00 19.00 14.00 6.00 27.00 12.00 21.00
CV residual -17.8 5.37 -6.84 7.93 -5.22 -4.81 -1.83 -2.18 -5.86 -8.32 1.74 -3.24 5.25
121 140 182 199 226 257 288 291 296 303 311 316
Predicted 14.14 17.6 14.44 23.28 17.926 15.659 21.95 16.4 16.6 15.02 16.759 19.77
cvpred 13.96 17.3 14.44 22.97 17.841 15.557 21.87 16.0 16.9 14.82 16.685 20.81
SC_FE_TOT 22.00 31.0 16.00 24.00 18.000 15.000 27.00 28.0 0.0 7.00 16.000 25.00
CV residual 8.04 13.7 1.56 1.03 0.159 -0.557 5.13 12.0 -16.9 -7.82 -0.685 4.19
325 327 328 341 352 354 365 370 376 387 388 416 423
Predicted 16.346 15.7 17.52 18.88 22.38 26.74 21.1 19.91 17.5 15.3 24.22 23.63 29.77
cvpred 16.937 15.7 17.75 19.13 22.69 26.27 21.0 20.47 17.6 15.0 24.35 23.42 29.51
SC_FE_TOT 16.000 28.0 14.00 26.00 28.00 34.00 36.0 14.00 4.0 25.0 27.00 27.00 37.00
CV residual -0.937 12.3 -3.75 6.87 5.31 7.73 15.0 -6.47 -13.6 10.0 2.65 3.58 7.49
424 428 432 433 440 448 452 462 464 470
Predicted 28.3 23.60 27.60 16.20 24.49 29.07 19.43 16.7 18.08 32.40
cvpred 28.2 23.31 27.56 16.02 24.46 28.66 19.67 17.2 18.22 32.21
SC_FE_TOT 34.0 21.00 25.00 15.00 20.00 26.00 17.00 0.0 15.00 34.00
CV residual 5.8 -2.31 -2.56 -1.02 -4.46 -2.66 -2.67 -17.2 -3.22 1.79
Sum of squares = 2769 Mean square = 57.7 n = 48
fold 5
Observations in test set: 48
6 12 14 31 33 39 47 54 61 68 78 85
Predicted 13.94 13.28 32.18 19.00 21.56 21.4 23.6 16.4 13.954 17.00 16.779 16.46
cvpred 13.74 13.41 32.83 19.77 22.26 21.2 23.3 16.7 13.878 17.33 16.607 16.48
SC_FE_TOT 11.00 11.00 40.00 14.00 20.00 9.0 37.0 5.0 14.000 9.00 16.000 24.00
CV residual -2.74 -2.41 7.17 -5.77 -2.26 -12.2 13.7 -11.7 0.122 -8.33 -0.607 7.52
93 98 107 110 126 156 161 194 198 203 216 229 231
Predicted 22.00 19.25 23.5 24.7 21.76 22.22 12.1 12.4 16.7 21.23 13.5 24.36 22.40
cvpred 22.13 19.44 24.0 24.8 22.08 22.46 12.2 12.6 16.7 21.03 13.4 24.88 22.33
SC_FE_TOT 28.00 16.00 13.0 12.0 28.00 32.00 27.0 0.0 5.0 18.00 27.0 21.00 15.00
CV residual 5.87 -3.44 -11.0 -12.8 5.92 9.54 14.8 -12.6 -11.7 -3.03 13.6 -3.88 -7.33
232 234 244 249 267 294 297 305 320 321 330 346 367
Predicted 30.77 17.374 15.15 20.79 20.86 20.1 16.76 14.9 19.74 17.0 12.84 15.90 15.48
cvpred 31.81 17.691 15.42 21.07 20.72 20.1 16.89 15.0 19.87 17.4 13.01 15.76 15.94
SC_FE_TOT 34.00 18.000 11.00 13.00 29.00 0.0 12.00 30.0 22.00 33.0 7.00 17.00 9.00
CV residual 2.19 0.309 -4.42 -8.07 8.28 -20.1 -4.89 15.0 2.13 15.6 -6.01 1.24 -6.94
380 389 393 403 404 412 422 459 460 474
Predicted 21.0 25.24 25.77 23.45 19.6 24.6 13.47 25.5 13.232 22.66
cvpred 20.6 25.56 26.37 23.37 19.9 24.9 13.38 26.9 13.133 22.97
SC_FE_TOT 30.0 32.00 29.00 21.00 7.0 7.0 15.00 0.0 13.000 17.00
CV residual 9.4 6.44 2.63 -2.37 -12.9 -17.9 1.62 -26.9 -0.133 -5.97
Sum of squares = 4471 Mean square = 93.2 n = 48
fold 6
Observations in test set: 48
2 4 21 28 38 46 62 86 91 92 108 113 129
Predicted 32.13 10.40 16.35 13.94 19.6 13.8 15.80 13.7 18.6 16.99 14.55 23.96 23.6
cvpred 31.85 10.75 16.16 13.97 19.3 13.5 15.64 13.6 18.8 17.21 14.44 23.73 23.6
SC_FE_TOT 39.00 15.00 8.00 7.00 37.0 0.0 9.00 33.0 13.0 15.00 13.00 22.00 34.0
CV residual 7.15 4.25 -8.16 -6.97 17.7 -13.5 -6.64 19.4 -5.8 -2.21 -1.44 -1.73 10.4
130 145 157 165 168 179 181 208 217 220 239 240 252
Predicted 26.47 16.95 21.4 17.3 31.24 31.10 15.958 19.87 13.51 23.70 22.17 27.00 19.6
cvpred 26.22 16.91 20.9 17.5 30.86 30.42 16.015 19.96 13.38 23.49 22.18 26.74 19.1
SC_FE_TOT 28.00 15.00 33.0 11.0 33.00 39.00 17.000 23.00 18.00 15.00 30.00 29.00 30.0
CV residual 1.78 -1.91 12.1 -6.5 2.14 8.58 0.985 3.04 4.62 -8.49 7.82 2.26 10.9
277 278 295 306 307 319 323 337 342 345 382 397 401
Predicted 21.33 20.2 23.7 20.9 11.921 12.1 19.74 22.49 14.79 16.85 17.7 19.2 23.56
cvpred 20.83 20.1 23.6 20.3 11.567 12.2 19.54 22.83 14.85 16.78 17.6 19.1 23.32
SC_FE_TOT 18.00 8.0 33.0 27.0 11.000 23.0 24.00 13.00 13.00 11.00 25.0 30.0 18.00
CV residual -2.83 -12.1 9.4 6.7 -0.567 10.8 4.46 -9.83 -1.85 -5.78 7.4 10.9 -5.32
407 409 418 421 434 442 444 455 463
Predicted 18.62 26.64 20.8 27.072 15.84 25.14 26.54 11.02 22.94
cvpred 18.66 27.02 21.2 26.906 15.47 24.98 26.34 10.82 22.98
SC_FE_TOT 14.00 29.00 10.0 26.000 17.00 29.00 24.00 17.00 14.00
CV residual -4.66 1.98 -11.2 -0.906 1.53 4.02 -2.34 6.18 -8.98
Sum of squares = 2870 Mean square = 59.8 n = 48
fold 7
Observations in test set: 48
17 24 25 53 58 59 66 87 94 102 103 115
Predicted 23.81 18.96 12.83 15.2 8.11 15.447 13.805 24.84 18.6 21.78 21.20 15.99
cvpred 23.63 18.94 13.47 15.6 8.69 16.003 14.292 24.61 19.1 21.31 21.04 16.56
SC_FE_TOT 30.00 20.00 11.00 3.0 10.00 17.000 14.000 28.00 7.0 30.00 19.00 9.00
CV residual 6.37 1.06 -2.47 -12.6 1.31 0.997 -0.292 3.39 -12.1 8.69 -2.04 -7.56
120 146 147 155 163 169 187 189 195 221 259 268 284
Predicted 15.9 16.23 18.81 15.1 -9.37 23.8 16.14 15.49 25.9 22.2 6.01 8.63 22.84
cvpred 16.1 16.18 18.62 15.2 -10.18 23.9 16.38 15.64 25.5 22.5 6.98 8.06 22.35
SC_FE_TOT 27.0 13.00 17.00 4.0 0.00 25.0 12.00 6.00 33.0 36.0 0.00 0.00 20.00
CV residual 10.9 -3.18 -1.62 -11.2 10.18 1.1 -4.38 -9.64 7.5 13.5 -6.98 -8.06 -2.35
285 308 309 313 318 329 333 335 350 355 356 375 378
Predicted 17.114 26.10 15.907 20.13 15.84 20.96 18.31 15.33 22.7 27.50 21.78 29.49 22.3
cvpred 17.288 25.69 16.149 20.08 16.14 21.38 18.79 13.65 22.8 27.14 21.65 29.37 21.4
SC_FE_TOT 17.000 27.00 17.000 24.00 18.00 15.00 14.00 22.00 34.0 30.00 20.00 23.00 34.0
CV residual -0.288 1.31 0.851 3.92 1.86 -6.38 -4.79 8.35 11.2 2.86 -1.65 -6.37 12.6
400 420 427 430 447 449 451 465 467 473
Predicted 26.89 26.728 24.51 15.70 19.6 20.67 18.52 15.09 15.27 14.25
cvpred 26.54 26.335 24.59 15.87 19.7 20.59 18.87 15.35 15.34 14.79
SC_FE_TOT 31.00 26.000 26.00 15.00 24.0 16.00 12.00 8.00 12.00 11.00
CV residual 4.46 -0.335 1.41 -0.87 4.3 -4.59 -6.87 -7.35 -3.34 -3.79
Sum of squares = 2013 Mean square = 41.9 n = 48
fold 8
Observations in test set: 48
5 9 18 29 37 44 52 71 76 80 122 134 136
Predicted 19.6 18.87 19.34 30.07 23.14 25.24 18.82 17.51 18.37 15.0 16.79 17.504 9.45
cvpred 20.6 18.96 20.42 30.65 23.97 26.09 19.68 17.88 18.92 15.2 17.34 18.902 9.60
SC_FE_TOT 14.0 25.00 11.00 29.00 14.00 24.00 13.00 11.00 17.00 0.0 10.00 18.000 9.00
CV residual -6.6 6.04 -9.42 -1.65 -9.97 -2.09 -6.68 -6.88 -1.92 -15.2 -7.34 -0.902 -0.60
148 166 167 171 180 183 206 207 214 247 254 256 258
Predicted 11.4 18.5 26.625 18.8 19.36 11.60 21.9 19.4 26.30 16.71 23.12 16.01 26.1
cvpred 11.6 18.8 27.088 19.4 19.54 11.85 22.0 19.7 25.81 17.05 24.14 15.97 26.8
SC_FE_TOT 23.0 10.0 28.000 4.0 21.00 8.00 9.0 8.0 34.00 8.00 16.00 19.00 16.0
CV residual 11.4 -8.8 0.912 -15.4 1.46 -3.85 -13.0 -11.7 8.19 -9.05 -8.14 3.03 -10.8
263 287 298 304 317 322 331 338 344 360 383 384 394
Predicted 14.2 24.41 20.855 12.48 20.23 17.69 28.78 23.63 17.74 27.43 14.8 14.2 27.36
cvpred 15.0 24.75 20.719 12.77 20.78 17.98 29.15 23.77 18.22 28.34 14.7 14.8 27.26
SC_FE_TOT 0.0 23.00 20.000 17.00 22.00 16.00 36.00 14.00 10.00 26.00 23.0 11.0 36.00
CV residual -15.0 -1.75 -0.719 4.23 1.22 -1.98 6.85 -9.77 -8.22 -2.34 8.3 -3.8 8.74
398 425 426 429 438 443 445 454 475
Predicted 22.23 16.85 7.74 26.53 14.21 14.1 26.83 21.15 18.87
cvpred 23.22 16.92 8.40 26.82 14.77 14.4 27.69 21.05 19.68
SC_FE_TOT 21.00 12.00 13.00 18.00 5.00 13.0 21.00 16.00 10.00
CV residual -2.22 -4.92 4.60 -8.82 -9.77 -1.4 -6.69 -5.05 -9.68
Sum of squares = 2784 Mean square = 58 n = 48
fold 9
Observations in test set: 47
1 8 19 41 49 50 51 56 57 89 96 116 128
Predicted 25.8 21.22 25.2 15.24 16.9 24.1 15.6 18.40 17.03 23.55 10.67 16.36 15.66
cvpred 25.9 21.42 25.3 15.03 16.8 24.1 15.4 19.21 17.68 23.45 10.41 16.78 15.64
SC_FE_TOT 12.0 20.00 34.0 10.00 37.0 0.0 31.0 13.00 15.00 29.00 9.00 10.00 8.00
CV residual -13.9 -1.42 8.7 -5.03 20.2 -24.1 15.6 -6.21 -2.68 5.55 -1.41 -6.78 -7.64
158 162 176 178 192 205 219 222 228 237 264 271 273
Predicted 23.36 25.99 23.154 12.65 18.3509 19.66 22.34 23.7 27.4 15.52 22.75 21.55 16.1
cvpred 23.85 26.44 23.016 12.19 18.0828 20.07 23.22 23.6 27.4 15.11 23.38 21.41 15.7
SC_FE_TOT 15.00 30.00 24.000 14.00 18.0000 12.00 22.00 33.0 30.0 10.00 15.00 23.00 26.0
CV residual -8.85 3.56 0.984 1.81 -0.0828 -8.07 -1.22 9.4 2.6 -5.11 -8.38 1.59 10.3
274 286 289 292 302 315 334 339 358 366 377 390 396
Predicted 19.27 16.92 18.21 14 17.78 20.553 18.1097 18.15 23.34 14.27 14.0 25.46 26.5
cvpred 18.78 16.71 17.84 13 17.03 20.256 18.0112 17.29 23.09 14.27 12.9 26.22 26.2
SC_FE_TOT 13.00 10.00 21.00 33 13.00 21.000 18.0000 11.00 26.00 12.00 26.0 29.00 28.0
CV residual -5.78 -6.71 3.16 20 -4.03 0.744 -0.0112 -6.29 2.91 -2.27 13.1 2.78 1.8
406 411 437 456 466 468 469 476
Predicted 18.01 17.537 20.2 10.64 15.03 14.24 11.83 15.967
cvpred 18.04 18.792 19.6 9.51 14.75 14.52 11.08 15.903
SC_FE_TOT 26.00 19.000 35.0 20.00 13.00 10.00 20.00 15.000
CV residual 7.96 0.208 15.4 10.49 -1.75 -4.52 8.92 -0.903
Sum of squares = 3428 Mean square = 72.9 n = 47
fold 10
Observations in test set: 47
11 26 32 35 43 64 82 83 88 100 101 112
Predicted 15.87 12.17 20.27 14.09 15.6 7.33 13.22 18.11 15.086 19.08 21.42 17.11
cvpred 15.39 11.77 19.97 13.41 16.1 6.36 12.72 18.24 15.194 18.91 21.61 17.02
SC_FE_TOT 14.00 13.00 14.00 17.00 0.0 12.00 20.00 16.00 16.000 23.00 15.00 14.00
CV residual -1.39 1.23 -5.97 3.59 -16.1 5.64 7.28 -2.24 0.806 4.09 -6.61 -3.02
124 132 137 153 160 177 185 197 202 213 215 218 227 230
Predicted 15.15 15.00 20.84 20.49 20.6 14.03 16 16.5 11.1 25.6 20.2 23 21.7 14.2
cvpred 14.74 14.81 20.84 20.53 20.8 13.39 16 16.4 10.2 25.9 19.7 23 21.7 13.8
SC_FE_TOT 19.00 10.00 12.00 29.00 34.0 20.00 18 6.0 0.0 37.0 17.0 13 33.0 35.0
CV residual 4.26 -4.81 -8.84 8.47 13.2 6.61 2 -10.4 -10.2 11.1 -2.7 -10 11.3 21.2
235 238 246 251 260 276 281 290 312 361 362 363 374
Predicted 19.89 12.02 18.9 14.94 20.3 22.30 19.39 18.5 24.1 10.13 17.91 30.33 15.823
cvpred 19.81 11.41 19.3 14.53 20.5 22.85 19.02 18.1 24.3 9.58 18.06 30.86 15.281
SC_FE_TOT 13.00 15.00 0.0 24.00 10.0 18.00 10.00 31.0 12.0 15.00 21.00 28.00 16.000
CV residual -6.81 3.59 -19.3 9.47 -10.5 -4.85 -9.02 12.9 -12.3 5.42 2.94 -2.86 0.719
385 413 414 431 439 441 461 471
Predicted 31.72 15.32 12.1 29.17 26.13 22.5 10.1 17.0
cvpred 32.22 15.24 11.3 28.99 26.46 22.4 9.3 16.9
SC_FE_TOT 36.00 14.00 28.0 33.00 33.00 14.0 21.0 32.0
CV residual 3.78 -1.24 16.7 4.01 6.54 -8.4 11.7 15.1
Sum of squares = 3795 Mean square = 80.8 n = 47
Overall (Sum over all 47 folds)
ms
65.1
# the cross-validated standard error of estimate is the square root of the
# overall cross-validation mean square (stored in the "ms" attribute of cv.out)
sqrt(attr(cv.out, "ms"))
[1] 8.07
# the same standard error expressed as a fraction of the maximum exam score (40)
sqrt(attr(cv.out, "ms")) / 40
[1] 0.202
The cross-validated standard error of prediction of the final exam score is 8.066 (the maximum score is 40), i.e., about 20% of the maximum score.
Features need to be scaled
# count, for every column except the first (user_id), the values that
# boxplot.stats() flags as outliers
vapply(lm13.data[, -1],
       function(col.values) length(boxplot.stats(col.values)$out),
       integer(1))
MCQ_ind EXE_ind Mon_count Tue_count Wed_count
0 0 12 3 11
Thu_count weekday_entropy weekly_entropy SC_FE_TOT
18 16 33 0
# some features have a lot of outliers: weekly_entropy:33, weekday_entropy:16
## due to outliers, use standardization with median and Interquartile Range (IQR)
# Scale every predictor as (x - median) / IQR. The student id and the outcome are
# excluded by name rather than by column position, so the selection does not
# depend on the column order of lm13.data.
scaled.mod13.dat <- data.frame(lapply(
  lm13.data[, !(names(lm13.data) %in% c("user_id", "SC_FE_TOT")), drop = FALSE],
  function(x) {
    spread <- IQR(x, na.rm = TRUE)
    # a zero (or undefined) IQR would silently produce Inf/NaN features
    if (!isTRUE(spread > 0)) {
      stop("cannot scale a feature whose IQR is zero or undefined", call. = FALSE)
    }
    (x - median(x, na.rm = TRUE)) / spread
  }))
summary(scaled.mod13.dat)
MCQ_ind EXE_ind Mon_count Tue_count Wed_count
Min. :-0.800 Min. :-1.500 Min. :-0.615 Min. :-0.765 Min. :-0.6
1st Qu.:-0.400 1st Qu.:-0.500 1st Qu.:-0.385 1st Qu.:-0.471 1st Qu.:-0.4
Median : 0.000 Median : 0.000 Median : 0.000 Median : 0.000 Median : 0.0
Mean : 0.157 Mean :-0.004 Mean : 0.213 Mean : 0.097 Mean : 0.2
3rd Qu.: 0.600 3rd Qu.: 0.500 3rd Qu.: 0.615 3rd Qu.: 0.529 3rd Qu.: 0.6
Max. : 1.600 Max. : 1.500 Max. : 3.000 Max. : 2.176 Max. : 3.4
Thu_count weekday_entropy weekly_entropy
Min. :-1.00 Min. :-6.58 Min. :-32.2
1st Qu.:-0.46 1st Qu.:-0.56 1st Qu.: -0.6
Median : 0.00 Median : 0.00 Median : 0.0
Mean : 0.13 Mean :-0.19 Mean : -0.6
3rd Qu.: 0.54 3rd Qu.: 0.44 3rd Qu.: 0.4
Max. : 3.62 Max. : 1.18 Max. : 0.9
# append the (unscaled) final exam score as the outcome column FE_SCORE
scaled.mod13.dat$FE_SCORE <- lm13.data$SC_FE_TOT
Build a RF model
## fit a random forest regression model through caret
library(caret)
# resampling scheme: 10-fold cross-validation repeated 5 times,
# with a grid search over the tuning parameter
tr.control <- trainControl(search = "grid",
                           method = "repeatedcv",
                           repeats = 5,
                           number = 10)
## default for mtry is 1/3 of the number of predictors (there are 8 predictors here);
## candidate mtry values to evaluate: 2, 3, 4, 5
tune.grid <- expand.grid(.mtry = 2:5)
# fix the seed so that the resampling and the forests are reproducible
set.seed(372017)
rf <- train(FE_SCORE ~ .,
            data = scaled.mod13.dat,
            method = "rf",
            trControl = tr.control,
            tuneGrid = tune.grid,
            importance = TRUE,
            ntree = 3500)
print(rf)
Random Forest
477 samples
8 predictor
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 428, 428, 430, 431, 429, 428, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared
2 7.99 0.342
3 8.01 0.339
4 8.01 0.339
5 8.02 0.337
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.
The best model: mtry=2, RMSE = 7.993946, Rsquared=0.3424
Examine the importance of features
# variable importance of the tuned random forest; with scale = TRUE the scores
# are rescaled so that they range from 0 to 100
varImp(rf, scale = TRUE)
rf variable importance
Overall
EXE_ind 100.0
MCQ_ind 78.0
weekly_entropy 64.3
Thu_count 47.0
Mon_count 38.8
weekday_entropy 38.2
Tue_count 25.2
Wed_count 0.0